#################
# load packages #
#################
library(dplyr)
library(stm)
library(quanteda)
library(quanteda.textplots)
library(quanteda.textstats)
library(quanteda.corpora)
library(stopwords)
library(ggplot2)
library(patchwork)

######################
# load and prep data #
######################
## set working directory - insert relevant path in next line (and drop # sign)
  # setwd("[working directory path]")
## read the processed text corpus together with its covariates
corpus.cov <- read.csv(file = "BSG & Yakter PSRM - text corpus processed.csv")

## build the corpus: "uid" supplies the document ids and "trim" the text
dat.corpus <- corpus(
  corpus.cov,
  docid_field = "uid",
  text_field = "trim",
  unique_docnames = FALSE
)

## tokenize, then clean following Quanteda's suggested Hebrew optimization
dat.tok <- tokens(dat.corpus)
# step 1: drop numbers (punctuation removal is switched off in this call)
dat.tok <- tokens(dat.tok, remove_punct = FALSE, remove_numbers = TRUE)
# step 2: keep only tokens made up entirely of Hebrew-script characters
dat.tok <- tokens_select(
  dat.tok,
  pattern = "^[\\p{script=Hebr}]+$",
  valuetype = "regex"
)
# step 3: drop Hebrew stop words (marimo list) and tokens under 2 characters
dat.tok <- tokens_remove(
  dat.tok,
  pattern = stopwords("he", source = "marimo"),
  min_nchar = 2
)
# sanity check: show every token of the second document
print(dat.tok[2], max_ndoc = 1, max_ntoken = -1)

## document-feature matrix (dfm), keeping features of at least 3 characters
dat.dfm <- dfm_keep(dfm(dat.tok), min_nchar = 3)

##################################################
# keyness - violence vs. non-violence (Figure 4) #
##################################################
## run analysis
# target group: documents in the violence condition (tr_violence == 1);
# all remaining documents form the reference group
tstat_key.vl <- textstat_keyness(
  dat.dfm,
  target = docvars(dat.corpus, "tr_violence") == 1
)
# first and last 20 rows of the keyness table
tstat_key.vl.top <- head(tstat_key.vl, n = 20)
tstat_key.vl.bottom <- tail(tstat_key.vl, n = 20)

## export both ends of the table for manual translation to English
## (translate only top + bottom 20 words with minimum count of 5)
write.csv(
  tstat_key.vl.top,
  file = "BSG & Yakter PSRM - keyness heb violence.csv",
  row.names = FALSE
)
write.csv(
  tstat_key.vl.bottom,
  file = "BSG & Yakter PSRM - keyness heb nonviolence.csv",
  row.names = FALSE
)

## after manual translation, load and plot English version of the results
# load keyness scores with translation; the files must provide the columns
# "english" (translated term) and "chi2" (keyness score)
tstat_key.vl.eng <- read.csv("BSG & Yakter PSRM - keyness eng violence.csv")
tstat_key.nvl.eng <- read.csv("BSG & Yakter PSRM - keyness eng nonviolence.csv")
# omit terms without English translation (i.e., below top 20 or with <5 count);
# a blank cell and a missing (NA) value both count as "not translated"
tstat_key.vl.eng <- tstat_key.vl.eng[
  !is.na(tstat_key.vl.eng$english) & tstat_key.vl.eng$english != "",
]
tstat_key.nvl.eng <- tstat_key.nvl.eng[
  !is.na(tstat_key.nvl.eng$english) & tstat_key.nvl.eng$english != "",
]
# order words by their chi2 values for the plot; the levels are taken from the
# "english" column by name so the result does not depend on column position
tstat_key.vl.eng$english <- factor(
  tstat_key.vl.eng$english,
  levels = tstat_key.vl.eng$english[order(tstat_key.vl.eng$chi2)]
)
tstat_key.nvl.eng$english <- factor(
  tstat_key.nvl.eng$english,
  levels = tstat_key.nvl.eng$english[order(tstat_key.nvl.eng$chi2)]
)

## plot
# each panel is a horizontal bar chart: chi-square keyness score on the x axis,
# translated term on the y axis (in the level order of the "english" factor)
# violence condition
plot.key.vl <- ggplot(data = tstat_key.vl.eng, aes(x = chi2, y = english)) +
  geom_col() +
  labs(x = "Keyness Score (Chi Square)", title = "Violence Condition") +
  theme_bw() +
  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    plot.title = element_text(size = 11),
    axis.title.y = element_blank()
  )
# non-violence condition
plot.key.nvl <- ggplot(data = tstat_key.nvl.eng, aes(x = chi2, y = english)) +
  geom_col() +
  labs(x = "Keyness Score (Chi Square)", title = "Non-violence Condition") +
  theme_bw() +
  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    plot.title = element_text(size = 11),
    axis.title.y = element_blank()
  )
# combine the two panels side by side (patchwork)
plot.key.vl.both <- plot.key.vl + plot.key.nvl
# save file; the argument is spelled out as `filename` instead of relying on
# partial matching of `file`
ggsave(
  filename = "Figure 4.pdf",
  plot = plot.key.vl.both,
  width = 6.5,
  height = 4
)

#################################################
# keyness - left-wing vs. right-wing (Figure 5) #
#################################################
# restrict the corpus to documents from the nonviolent treatments
dat.cor.nv <- corpus_subset(dat.corpus, tr_violence == 0)
# NOTE(review): this dfm comes from a fresh default tokens() call on the
# sub-corpus, so the Hebrew-only filter, stop-word removal and minimum-length
# steps used to build dat.dfm are not applied here - confirm this is intended
dfm.nv <- dat.cor.nv %>%
  tokens() %>%
  dfm()

# run analysis - right-wing respondents (vote_bloc_rcl == 3) vs. all others
tstat_key.rgt <- textstat_keyness(
  dfm.nv,
  target = docvars(dat.cor.nv, "vote_bloc_rcl") == 3
)
# run analysis - left-wing respondents (vote_bloc_rcl == 1) vs. all others
tstat_key.lft <- textstat_keyness(
  dfm.nv,
  target = docvars(dat.cor.nv, "vote_bloc_rcl") == 1
)

## export the full keyness tables for manual translation to English
## (translate only top 20 words with minimum count of 5)
write.csv(
  tstat_key.rgt,
  file = "BSG & Yakter PSRM - keyness heb right.csv",
  row.names = FALSE
)
write.csv(
  tstat_key.lft,
  file = "BSG & Yakter PSRM - keyness heb left.csv",
  row.names = FALSE
)

## after manual translation, load and plot English version of the results
# load keyness scores with translation; the files must provide the columns
# "english" (translated term) and "chi2" (keyness score)
tstat_key.rgt.eng <- read.csv("BSG & Yakter PSRM - keyness eng right.csv")
tstat_key.lft.eng <- read.csv("BSG & Yakter PSRM - keyness eng left.csv")
# omit terms without English translation (i.e., below top 20 or with <5 count);
# a blank cell and a missing (NA) value both count as "not translated"
tstat_key.rgt.eng <- tstat_key.rgt.eng[
  !is.na(tstat_key.rgt.eng$english) & tstat_key.rgt.eng$english != "",
]
tstat_key.lft.eng <- tstat_key.lft.eng[
  !is.na(tstat_key.lft.eng$english) & tstat_key.lft.eng$english != "",
]
# order words by their chi2 values for the plot; the levels are taken from the
# "english" column by name so the result does not depend on column position
tstat_key.rgt.eng$english <- factor(
  tstat_key.rgt.eng$english,
  levels = tstat_key.rgt.eng$english[order(tstat_key.rgt.eng$chi2)]
)
tstat_key.lft.eng$english <- factor(
  tstat_key.lft.eng$english,
  levels = tstat_key.lft.eng$english[order(tstat_key.lft.eng$chi2)]
)

## plot
# each panel is a horizontal bar chart: chi-square keyness score on the x axis,
# translated term on the y axis (in the level order of the "english" factor)
# right-wingers
# NOTE(review): the x axis is fixed to 0-10 for this panel only; with ggplot2's
# default out-of-bounds handling a bar with chi2 above 10 would presumably be
# dropped - confirm no plotted term exceeds the limit
plot.key.rgt <- ggplot(data = tstat_key.rgt.eng, aes(x = chi2, y = english)) +
  geom_col() +
  labs(x = "Keyness Score (Chi Square)", title = "Right-wing Respondents") +
  scale_x_continuous(limits = c(0, 10), breaks = seq(0, 10, by = 2)) +
  theme_bw() +
  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    plot.title = element_text(size = 11),
    axis.title.y = element_blank()
  )
# left wingers
plot.key.lft <- ggplot(data = tstat_key.lft.eng, aes(x = chi2, y = english)) +
  geom_col() +
  labs(x = "Keyness Score (Chi Square)", title = "Left-wing Respondents") +
  theme_bw() +
  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    plot.title = element_text(size = 11),
    axis.title.y = element_blank()
  )
# combine the two panels side by side (patchwork), left-wing panel first
plot.key.both <- plot.key.lft + plot.key.rgt

# save file; the argument is spelled out as `filename` instead of relying on
# partial matching of `file`
ggsave(
  filename = "Figure 5.pdf",
  plot = plot.key.both,
  width = 6.5,
  height = 4
)

##############################################
# robustness - manipulation check (Table A4) #
# for action-type treatment (Hebrew words)   #
##############################################
# 12 most frequent features within each level of the tr_action document
# variable; printed explicitly so the table also appears when the script is
# run through source(), where bare top-level values are not auto-printed
print(textstat_frequency(dat.dfm, n = 12, groups = tr_action))
# outcome in Hebrew, translated manually
